## behavioral data:

d <- fread(here("in", "behavior-and-events_stroop_2021-10-20_nice.csv"))
subjs_behav <- as.data.table(table(d$subj, d$session, d$wave))
names(subjs_behav) <- c("subj", "session", "wave", "nrow_eprime")
# subjs_behav$is_missing <- subjs_behav$nrow_eprime == 0
# subjs_behav$is_incomplete <- subjs_behav$nrow_eprime < 216




## fmri data (list of subjs who have):

fnames_subjlist <- list.files(here("in/subjs"), pattern = "^subjlist", full.names = TRUE)
names(fnames_subjlist) <- gsub("^.*stroop-rsa-pc/in/subjs/subjlist_fmri_(.*)\\.txt", "\\1", fnames_subjlist)
subjs_fmri <- rbindlist(lapply(fnames_subjlist, fread), idcol = "dmcc_session")
names(subjs_fmri)[2] <- "subj"
subjs_fmri <- separate(subjs_fmri, dmcc_session, c("dmcc", "session"))
subjs_fmri$wave <- ifelse(subjs_fmri$dmcc == "dmcc2", "wave1", ifelse(subjs_fmri$dmcc == "dmcc3", "wave2", "wave3"))
subjs_fmri$dmcc <- NULL
subjs_fmri <- as.data.table(subjs_fmri)
subjs_fmri$needs_rerun <- grepl("_", subjs_fmri$subj)
subjs_fmri$subj <- gsub("(.*[0-9])_(.*)", "\\1", subjs_fmri$subj)


## plot fmri and behav data

mat_fmri <- table(subjs_fmri$subj, paste0(subjs_fmri$wave, "_", subjs_fmri$session)) %>%
  as.data.frame %>%
  rename(subj = Var1, wave_session = Var2) %>%
  mutate(id = "fmri")
mat_behav <- subjs_behav %>% 
  mutate(wave_session = paste0(wave, "_", session), Freq = nrow_eprime %in% c(216, 240), id = "behav") %>%
  select(subj, wave_session, Freq, id)
mat <- full_join(mat_fmri, mat_behav) 
## Joining, by = c("subj", "wave_session", "Freq", "id")
mat %>%
  mutate(wave_session = gsub("wave([1-3])_(...).*", "w\\1_\\2", wave_session)) %>%
  ggplot(aes(wave_session, subj, fill = Freq)) +
  geom_raster() +
  facet_grid(cols = vars(id)) +
  scale_fill_gradient(low = "white", high = "black") +
  theme(legend.position = "none") +
  labs(x = "wave_session", y = "subj", title = "missing (white)")

## MB8 subjs

subjs_mb8 <- c("162026", "179245", "182840", "198855", "205220", "214524", "568963", "623844")


## twins

# from /data/nil-external/ccp/JosetAEtzel/DMCC_files/getPairs.Rsubjs_twins

subjs_mz <- data.table(
  a = c(
    "DMCC1624043", "DMCC1971064", "233326", "209228", "DMCC4260551", "DMCC6755891", "DMCC6904377", "DMCC9850294",
    "162026", "DMCC8078683", "DMCC7297690", "DMCC6960387", "DMCC5065053", "DMCC5820265", "DMCC3206338"
    ),
  b = c(
    "DMCC3204738", "DMCC3509558", "352738", "765864", "DMCC7921988", "DMCC8050964", "DMCC9953810", "DMCC2560452",
    "568963", "DMCC6484785", "DMCC1596165", "DMCC8260571", "DMCC4368773", "DMCC3091953", "DMCC4854984"
    )
)
subjs_mz_mbsr <- data.table(
  a = c(
    "182436", "115825", "250427", "171330", "DMCC2834766", "DMCC2803654", "DMCC4191255", "562345",
    "DMCC5195268", "DMCC6661074", "DMCC5775387", "DMCC6371570", "DMCC2442951", "DMCC1328342",
    "DMCC3963378", "DMCC3876181", "DMCC5244053"
    ),
  b = c(
    "178647", "178243", "877168", "393550", "DMCC6671683", "DMCC9478705", "DMCC6418065", "130518",
    "DMCC8033964", "DMCC6705371", "DMCC6721369", "DMCC9441378", "DMCC6627478", "DMCC5009144",
    "DMCC2609759", "DMCC0472647", "DMCC8760894"
    )
  
)
subjs_dz <- data.table(
  a = c("198855", "DMCC8214059"),
  b = c("623844", "DMCC3062542")
)
subjs_dz_mbsr <- data.table(
  a = c("130114"),
  b = c("155938")
)
subjs_twins <- rbindlist(list(mz = subjs_mz, mz = subjs_mz_mbsr, dz = subjs_dz, dz = subjs_dz_mbsr), idcol = "twin")
subjs_twins[, twinpair := 1:.N]
subjs_twins <- melt(subjs_twins, id.vars = c("twin", "twinpair"), value.name = "subj")[, -"variable"]



## poor quality

# from: https://github.com/ccplabwustl/R01/blob/master/Jo/datasetQC/blocksTasksToDrop.txt

subjs_jolist <- data.table(
  rbind(
    ## DMCC2 for behavior ("sleeping"):
    c("behavior", "wave1", "233326", "proactive"),
    c("behavior", "wave1", "DMCC6371570", "reactive"),
    # DMCC2 for movement (too much censoring):
    c("movement", "wave1", "873968", "proactive"),
    c("movement", "wave1", "DMCC1971064", "proactive"),
    # DMCC3 for behavior ("sleeping"):
    c("behavior", "wave2", "161832", "baseline")
  )
)
names(subjs_jolist) <- c("issue", "wave", "subj", "session")



## bind

subjs <- full_join(left_join(full_join(subjs_behav, subjs_fmri), subjs_twins), subjs_jolist)
## Joining, by = c("subj", "session", "wave")
## Joining, by = "subj"
## Joining, by = c("subj", "session", "wave")

initial exclusions

missing and incomplete fmri runs

subjs_samp <- subjs[nrow_eprime %in% c(216, 240)]

subjs[nrow_eprime > 0 & nrow_eprime < 216]
##      subj  session  wave nrow_eprime needs_rerun twin twinpair issue
## 1: 178243 baseline wave1         189       FALSE   mz       17  <NA>

One subj has a partially complete set of runs in wave 1 baseline. Exclude.

all MB8 subjects

kable(subjs_samp[subj %in% subjs_mb8])
subj session wave nrow_eprime needs_rerun twin twinpair issue
162026 baseline wave1 216 FALSE mz 9 NA
179245 baseline wave1 216 FALSE NA NA NA
182840 baseline wave1 216 FALSE NA NA NA
198855 baseline wave1 216 NA dz 33 NA
205220 baseline wave1 216 FALSE NA NA NA
214524 baseline wave1 216 NA NA NA NA
568963 baseline wave1 216 FALSE mz 9 NA
623844 baseline wave1 216 FALSE dz 33 NA
162026 proactive wave1 216 FALSE mz 9 NA
179245 proactive wave1 216 FALSE NA NA NA
198855 proactive wave1 216 NA dz 33 NA
205220 proactive wave1 216 FALSE NA NA NA
214524 proactive wave1 216 NA NA NA NA
568963 proactive wave1 216 FALSE mz 9 NA
623844 proactive wave1 216 NA dz 33 NA
162026 reactive wave1 240 FALSE mz 9 NA
179245 reactive wave1 240 FALSE NA NA NA
182840 reactive wave1 240 FALSE NA NA NA
198855 reactive wave1 240 NA dz 33 NA
205220 reactive wave1 240 FALSE NA NA NA
214524 reactive wave1 240 NA NA NA NA
568963 reactive wave1 240 FALSE mz 9 NA
623844 reactive wave1 240 FALSE dz 33 NA
568963 baseline wave2 216 FALSE mz 9 NA
568963 proactive wave2 216 FALSE mz 9 NA
568963 reactive wave2 240 FALSE mz 9 NA
subjs_samp <- subjs_samp[!subj %in% subjs_mb8]

568963 was MB8 in wave 1, but MB4 in wave 2. Exclude this subj’s wave2 data for now (but could include later).

quality-based exclusions

movement

kable(subjs_samp[issue == "movement"])
subj session wave nrow_eprime needs_rerun twin twinpair issue
873968 proactive wave1 216 FALSE NA NA movement
DMCC1971064 proactive wave1 216 FALSE mz 2 movement
subjs_samp <- subjs_samp[is.na(issue) | issue != "movement"]

Two wave\(\cdot\)session\(\cdot\)subjs have excessive movements, as defined by Jo’s criteria (more than 20% of frames censored, i.e., FD > 0.9). Excluded.

RT

d_rt <- d %>% 
  right_join(subjs_samp) %>%
  filter(rt > 0, acc == 1)
## Joining, by = c("wave", "session", "subj")
d_rt %>%
  
  ggplot(aes(rt, subj, fill = wave, color = wave)) +
  geom_density_ridges(alpha = 0.2) +
  facet_grid(cols = vars(session)) +
  
  scale_fill_viridis_d() +
  scale_color_viridis_d()
## Picking joint bandwidth of 43.4
## Picking joint bandwidth of 42.3
## Picking joint bandwidth of 42.6

l <- split(d_rt, droplevels(interaction(d_rt$subj, d_rt$wave, d_rt$session)))
d_rt$rt_resid <- unlist(lapply(l, function(x) resid(lm(rt ~ item, x))))

d_rt %>% 
  
  # filter(subj %in% "849971") %>%
  
  ggplot(aes(sample = rt_resid)) +
  geom_qq(size = 0.75, shape = 16) +
  geom_qq_line() +
  facet_grid(rows = vars(subj), cols = vars(wave, session))

Not much looks weird from this angle.

Error

d_er <- d %>% right_join(subjs_samp)
## Joining, by = c("wave", "session", "subj")
sum_er <- d_er %>%
  
  group_by(subj, session, wave) %>%
  mutate(total = n()) %>%
  group_by(subj, session, wave, acc_final) %>%
  summarize(
    total = unique(total),
    percent = sum(n()) / total*100
    )
## `summarise()` has grouped output by 'subj', 'session', 'wave'. You can override using the `.groups` argument.
sum_er %>%
  
  filter(acc_final != "1") %>%
  
  ggplot(aes(percent, subj, color = acc_final)) +
  geom_point(size = 2) +
  geom_vline(xintercept = 10, color = "grey60") +
  facet_grid(cols = vars(wave, session)) +
  
  scale_color_viridis_d() +
  
  theme(legend.position = c(0.8, 0.1))

Line shows 10% threshold for “no response” used in Freund et al. (2021) JNeurosci.

bad <- sum_er %>% filter(acc_final == "no.response" & percent > 10)

bad
## # A tibble: 8 x 6
## # Groups:   subj, session, wave [8]
##   subj        session   wave  acc_final   total percent
##   <chr>       <chr>     <chr> <chr>       <int>   <dbl>
## 1 160830      reactive  wave1 no.response   240    15.4
## 2 161832      baseline  wave2 no.response   216    37.0
## 3 161832      reactive  wave2 no.response   240    27.9
## 4 197449      baseline  wave1 no.response   216    17.1
## 5 197449      baseline  wave3 no.response   216    13.4
## 6 233326      proactive wave1 no.response   216    14.4
## 7 DMCC5009144 baseline  wave2 no.response   216    13.4
## 8 DMCC6371570 reactive  wave1 no.response   240    32.5
sum(subjs_samp$issue == "behavior", na.rm = TRUE)
## [1] 3
subjs_samp <- subjs_samp %>%
  filter(!paste0(subj, session, wave) %in% paste0(bad$subj, bad$session, bad$wave)) %>%
  as.data.table

With that threshold, 3 wave\(\cdot\)session\(\cdot\)subjs don’t meet criteria. Three of those subjects were also ID’d by Jo’s (less strict) “sleeping” criteria. (Her criteria didn’t ID any others not picked up by the 10% threshold.)

session\(\cdot\)wave splits

mat_good_data <- 
  table(subjs_samp$subj, paste0(subjs_samp$wave, "_", subjs_samp$session)) %>%
  as.data.frame %>%
  rename(subj = Var1, wave_session = Var2)
x <- as.matrix(table(subjs_samp$subj, paste0(subjs_samp$wave, "_", subjs_samp$session)))
d <- dist(x, method = "euclidean") # distance matrix
fit <- hclust(d, method="ward.D")
# plot(fit)
groups <- cutree(fit, k = 5) # cut tree into 5 clusters
mat_good_data$subj <- factor(mat_good_data$subj, levels = names(sort(groups)))


mat_good_data %>%
  mutate(wave_session = gsub("wave([1-3])_(...).*", "w\\1_\\2", wave_session)) %>%
  ggplot(aes(wave_session, subj, fill = Freq)) +
  geom_raster() +
  scale_fill_gradient(low = "white", high = "black") +
  theme(legend.position = "none") +
  labs(x = "wave_session", y = "subj", title = "missing (white)")

baseline–proactive: BP

B1P1

twins_in_set <- function(x) x[duplicated(x)]

subjs_bp <- subjs_samp[wave == "wave1" & session %in% c("baseline", "proactive")]
subjs_both_b1p1 <- subjs_bp$subj[duplicated(subjs_bp$subj)]  ## get subjs in both
subjs_bp <- subjs_bp[subj %in% subjs_both_b1p1]

n_b1p1 <- length(unique(subjs_bp$subj))  ## wave 1 n subjs
twins_b1p1 <- twins_in_set(subjs_bp[session == "baseline" & !is.na(twin)]$twinpair)  ## wave 1 n twinpairs
n_twins_b1p1 <- length(twins_b1p1)  ## wave 1 n twinpairs

n_b1p1  ## n total
## [1] 85
n_twins_b1p1  ## n twinpairs
## [1] 21
n_b1p1 - n_twins_b1p1  ## n unrelated
## [1] 64

B2P2, B3P3

subjs_bp2 <- subjs_samp[wave == "wave2" & session %in% c("baseline", "proactive")]
subjs_both_b2p2 <- subjs_bp2$subj[duplicated(subjs_bp2$subj)]  ## get subjs in both
subjs_bp2 <- subjs_bp2[subj %in% subjs_both_b2p2]

n_b2p2 <- length(unique(subjs_bp2$subj))  ## wave 2 n subjs
twins_b2p2 <- twins_in_set(subjs_bp2[session == "baseline" & !is.na(twin)]$twinpair)
n_twins_b2p2 <- length(twins_b2p2)  ## wave 1 n twinpairs

n_b2p2  ## n total
## [1] 41
n_twins_b2p2  ## n twinpairs
## [1] 13
n_b2p2 - n_twins_b2p2  ## n unrelated
## [1] 28
length(intersect(subjs_bp2$subj, subjs_bp$subj))  ## subjs in BP2 that were in BP2
## [1] 36
subjs_bp3 <- subjs_samp[wave == "wave3" & session %in% c("baseline", "proactive")]
subjs_both_b3p3 <- subjs_bp3$subj[duplicated(subjs_bp3$subj)]  ## get subjs in both
subjs_bp3 <- subjs_bp3[subj %in% subjs_both_b3p3]

n_b3p3 <- length(unique(subjs_bp3$subj))  ## wave 3 n subjs
twins_b3p3 <- twins_in_set(subjs_bp3[session == "baseline" & !is.na(twin)]$twinpair)  ## wave 1 n twinpairs
n_twins_b3p3 <- length(twins_b3p3)  ## wave 1 n twinpairs

n_b3p3  ## n total
## [1] 5
n_twins_b3p3  ## n twinpairs
## [1] 1
n_b3p3 - n_twins_b3p3  ## n unrelated
## [1] 4

proactive–baseline: PB

B2P1

subjs_b2p1 <- subjs_samp[(wave == "wave1" & session %in% "proactive") | (wave == "wave2" & session %in% "baseline")]
subjs_both_b2p1 <- subjs_b2p1$subj[duplicated(subjs_b2p1$subj)]  ## get subjs in both
subjs_b2p1 <- subjs_b2p1[subj %in% subjs_both_b2p1]

n_b2p1 <- length(unique(subjs_b2p1$subj))  ## wave 2 n subjs
twins_b2p1 <- twins_in_set(subjs_b2p1[session == "baseline" & !is.na(twin)]$twinpair)
n_twins_b2p1 <- length(twins_b2p1)  ## wave 1 n twinpairs

n_b2p1  ## n total
## [1] 42
n_twins_b2p1  ## n twinpairs
## [1] 13
n_b2p1 - n_twins_b2p1  ## n unrelated
## [1] 29
subjs_b3p1 <- subjs_samp[(wave == "wave1" & session %in% "proactive") | (wave == "wave3" & session %in% "baseline")]
subjs_both_b3p1 <- subjs_b3p1$subj[duplicated(subjs_b3p1$subj)]  ## get subjs in both
subjs_b3p1 <- subjs_b3p1[subj %in% subjs_both_b3p1]

n_b3p1 <- length(unique(subjs_b3p1$subj))  ## wave 2 n subjs
twins_b3p1 <- twins_in_set(subjs_b3p1[session == "baseline" & !is.na(twin)]$twinpair)  ## wave 1 n twinpairs
n_twins_b3p1 <- length(twins_b3p1)  ## wave 1 n twinpairs

n_b3p1  ## n total
## [1] 5
n_twins_b3p1  ## n twinpairs
## [1] 1
n_b3p1 - n_twins_b3p1  ## n unrelated
## [1] 4

reactive session

wave1

subjs_r1 <- subjs_samp[wave == "wave1" & session == "reactive"]

n_r1 <- length(unique(subjs_r1$subj))  ## wave 1 n subjs
twins_r1 <- twins_in_set(subjs_r1[session == "reactive" & !is.na(twin)]$twinpair)  ## wave 1 n twinpairs
n_twins_r1 <- length(twins_r1)  ## wave 1 n twinpairs

n_r1  ## n total
## [1] 91
n_twins_r1  ## n twinpairs
## [1] 23
n_r1 - n_twins_r1  ## n unrelated
## [1] 68

reactive test–retest

subjs_r1r2 <- subjs_samp[session == "reactive" & wave %in% c("wave1", "wave2")]
subjs_both_r1r2 <- subjs_r1r2$subj[duplicated(subjs_r1r2$subj)]  ## get subjs in both
subjs_r1r2 <- subjs_r1r2[subj %in% subjs_both_r1r2]

n_r1r2 <- length(unique(subjs_r1r2$subj))  ## wave 1 n subjs
twins_r1r2 <- twins_in_set(subjs_r1r2[wave == "wave1" & !is.na(twin)]$twinpair)  ## wave 1 n twinpairs
n_twins_r1r2 <- length(twins_r1r2)  ## wave 1 n twinpairs

n_r1r2  ## n total
## [1] 39
n_twins_r1r2  ## n twinpairs
## [1] 12
n_r1r2 - n_twins_r1r2  ## n unrelated
## [1] 27

define groups

out <- subjs_samp[wave %in% c("wave1", "wave2"), -c("nrow_eprime", "twin", "issue")]

sample_cotwins <- function(x) {
  set.seed(0)
  twinpairs <- unique(x$twinpair[duplicated(x$twinpair)])
  x <- x %>% filter(twinpair %in% twinpairs)  ## only twins in sample
  x %>%
    group_by(twinpair) %>%
    sample_n(1) %>%
    pull(subj)
}

mc_mi groups

cotwins_b1p1 <- sample_cotwins(subjs_twins[twinpair %in% twins_b1p1])  ## to exclude
out$is_mc_mi <- FALSE
out[
  wave == "wave1" & session %in% c("baseline", "proactive") & subj %in% setdiff(subjs_both_b1p1, cotwins_b1p1)
  ]$is_mc_mi <- TRUE

out$is_mc_mi_cotwin <- FALSE
out[
  wave == "wave1" & session %in% c("baseline", "proactive") & subj %in% cotwins_b1p1
  ]$is_mc_mi_cotwin <- TRUE


sum(out$is_mc_mi) / 2
## [1] 64
sum(out$is_mc_mi_cotwin) / 2
## [1] 21

mi_mc groups

### order control

cotwins_b2p1 <- sample_cotwins(subjs_twins[twinpair %in% twins_b2p1])  ## to exclude
out$is_mi_mc <- FALSE
out[
    ((wave == "wave2" & session %in% "baseline") | (wave == "wave1" & session %in% "proactive")) &
    subj %in% setdiff(subjs_both_b2p1, cotwins_b2p1)
  ]$is_mi_mc <- TRUE

out$is_mi_mc_cotwin <- FALSE
out[
    ((wave == "wave2" & session %in% "baseline") | (wave == "wave1" & session %in% "proactive")) &
    subj %in% cotwins_b2p1
  ]$is_mi_mc_cotwin <- TRUE


sum(out$is_mi_mc) / 2
## [1] 29
sum(out$is_mi_mc_cotwin) / 2
## [1] 13
## retest 

subjs_bp_retest <- intersect(subjs_both_b2p2, subjs_both_b1p1)
cotwins_bp_retest <- sample_cotwins(subjs_twins[subj %in% subjs_bp_retest])  ## to exclude
out$is_mc_mi_retest <- FALSE
out[
    session %in% c("baseline", "proactive") & subj %in% setdiff(subjs_bp_retest, cotwins_bp_retest)
  ]$is_mc_mi_retest <- TRUE

out$is_mc_mi_retest_cotwin <- FALSE
out[
    session %in% c("baseline", "proactive") & subj %in% cotwins_bp_retest
  ]$is_mc_mi_retest_cotwin <- TRUE


sum(out$is_mc_mi_retest) / 4
## [1] 27
sum(out$is_mc_mi_retest_cotwin) / 4
## [1] 9

ispc groups

subjs_r_retest <- unique(subjs_r1r2$subj)
cotwins_r_retest <- sample_cotwins(subjs_twins[subj %in% subjs_r_retest])  ## to exclude
out$is_ispc_retest <- FALSE
out[
    session %in% c("reactive") & subj %in% setdiff(subjs_r_retest, cotwins_r_retest)
  ]$is_ispc_retest <- TRUE

out$is_ispc_retest_cotwin <- FALSE
out[
    session %in% c("reactive") & subj %in% cotwins_r_retest
  ]$is_ispc_retest_cotwin <- TRUE


sum(out$is_ispc_retest) / 2
## [1] 27
sum(out$is_ispc_retest_cotwin) / 2
## [1] 12

write

fwrite(out, here("out", "subjlist.csv"))